import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df1=pd.read_csv("NASA_Exoplanet_Composite.csv") #5483
df2= pd.read_csv("NASA_Exoplanet_Planetary.csv")
C:\Users\User\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (4,5) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5483 entries, 0 to 5482 Columns: 313 entries, rowid to pl_ntranspec dtypes: float64(212), int64(23), object(78) memory usage: 13.1+ MB
df2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 34951 entries, 0 to 34950 Columns: 287 entries, rowid to pl_ntranspec dtypes: float64(235), int64(25), object(27) memory usage: 76.5+ MB
columns_df1 = set(df1.columns)
columns_df2 = set(df2.columns)
# Find matching columns
matching_columns = columns_df1.intersection(columns_df2)
print("Matching Columns:")
print(len(matching_columns))
Matching Columns: 254
planets=pd.merge(df1, df2,how="outer")
planets.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 40434 entries, 0 to 40433 Columns: 346 entries, rowid to releasedate dtypes: float64(237), int64(23), object(86) memory usage: 107.0+ MB
#finding empty columns
empty_columns = [col for col in planets.columns if planets[col].isna().all()]
print("Completely Empty Columns:")
print(empty_columns)
Completely Empty Columns: ['sy_kepmagerr1', 'sy_kepmagerr2']
p2=planets.drop(empty_columns, axis=1)
p2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 40434 entries, 0 to 40433 Columns: 344 entries, rowid to releasedate dtypes: float64(235), int64(23), object(86) memory usage: 106.4+ MB
threshold = 20000
# Identify columns to drop
columns_to_drop = [col for col in p2.columns if p2[col].isna().sum() >threshold]
# Drop the columns from the DataFrame
p3 = p2.drop(columns=columns_to_drop)
print(columns_to_drop)
print("Updated DataFrame shape:", p3.shape)
# Optionally save the cleaned dataset
['hd_name', 'hip_name', 'pl_orbper_reflink', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmax_reflink', 'pl_rade_reflink', 'pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_radj_reflink', 'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmasse_reflink', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassj_reflink', 'pl_bmassprov', 'pl_dens', 'pl_denserr1', 'pl_denserr2', 'pl_denslim', 'pl_dens_reflink', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccen_reflink', 'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insol_reflink', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqt_reflink', 'pl_orbincl', 'pl_orbinclerr1', 'pl_orbinclerr2', 'pl_orbincl_reflink', 'pl_tranmid_systemref', 'pl_tranmid_reflink', 'pl_imppar', 'pl_impparerr1', 'pl_impparerr2', 'pl_imppar_reflink', 'pl_trandeperr1', 'pl_trandeperr2', 'pl_trandep_reflink', 'pl_trandur_reflink', 'pl_ratdor', 'pl_ratdorerr1', 'pl_ratdorerr2', 'pl_ratdor_reflink', 'pl_ratror_reflink', 'pl_occdep', 'pl_occdeperr1', 'pl_occdeperr2', 'pl_occdeplim', 'pl_occdep_reflink', 'pl_orbtper', 'pl_orbtpererr1', 'pl_orbtpererr2', 'pl_orbtperlim', 'pl_orbtper_systemref', 'pl_orbtper_reflink', 'pl_orblper', 'pl_orblpererr1', 'pl_orblpererr2', 'pl_orblperlim', 'pl_orblper_reflink', 'pl_rvamp', 'pl_rvamperr1', 'pl_rvamperr2', 'pl_rvamplim', 'pl_rvamp_reflink', 'pl_projobliq', 'pl_projobliqerr1', 'pl_projobliqerr2', 'pl_projobliqlim', 'pl_projobliq_reflink', 'pl_trueobliq', 'pl_trueobliqerr1', 'pl_trueobliqerr2', 'pl_trueobliqlim', 'pl_trueobliq_reflink', 'st_spectype', 'st_spectype_reflink', 'st_teff_reflink', 'st_rad_reflink', 'st_mass_reflink', 'st_met_reflink', 'st_lum', 'st_lumerr1', 'st_lumerr2', 'st_lumlim', 'st_lum_reflink', 'st_logg_reflink', 'st_age', 'st_ageerr1', 'st_ageerr2', 'st_age_reflink', 'st_denserr1', 'st_denserr2', 'st_dens_reflink', 'st_vsin', 'st_vsinerr1', 'st_vsinerr2', 'st_vsinlim', 'st_vsin_reflink', 'st_rotp', 'st_rotperr1', 'st_rotperr2', 'st_rotplim', 'st_rotp_reflink', 'st_radv', 'st_radverr1', 'st_radverr2', 'st_radvlim', 'st_radv_reflink', 'ra_reflink', 'sy_pm_reflink', 'sy_dist_reflink', 'sy_plx_reflink', 'sy_bmag_reflink', 'sy_vmag_reflink', 'sy_jmag_reflink', 'sy_hmag_reflink', 'sy_kmag_reflink', 'sy_umag', 'sy_umagerr1', 'sy_umagerr2', 'sy_umag_reflink', 'sy_gmag', 'sy_gmagerr1', 'sy_gmagerr2', 'sy_gmag_reflink', 'sy_rmag', 'sy_rmagerr1', 'sy_rmagerr2', 'sy_rmag_reflink', 'sy_imag', 'sy_imagerr1', 'sy_imagerr2', 'sy_imag_reflink', 'sy_zmag', 'sy_zmagerr1', 'sy_zmagerr2', 'sy_zmag_reflink', 'sy_w1mag_reflink', 'sy_w2mag_reflink', 'sy_w3mag_reflink', 'sy_w4magerr1', 'sy_w4magerr2', 'sy_w4mag_reflink', 'sy_gaiamag_reflink', 'sy_icmag', 'sy_icmagerr1', 'sy_icmagerr2', 'sy_icmag_reflink', 'sy_tmag_reflink', 'sy_kepmag_reflink', 'pl_masse', 'pl_masseerr1', 'pl_masseerr2', 'pl_masselim', 'pl_massj', 'pl_massjerr1', 'pl_massjerr2', 'pl_massjlim', 'pl_msinie', 'pl_msinieerr1', 'pl_msinieerr2', 'pl_msinielim', 'pl_msinij', 'pl_msinijerr1', 'pl_msinijerr2', 'pl_msinijlim', 'pl_cmasse', 'pl_cmasseerr1', 'pl_cmasseerr2', 'pl_cmasselim', 'pl_cmassj', 'pl_cmassjerr1', 'pl_cmassjerr2', 'pl_cmassjlim'] Updated DataFrame shape: (40434, 155)
p3.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 40434 entries, 0 to 40433 Columns: 155 entries, rowid to releasedate dtypes: float64(109), int64(23), object(23) memory usage: 48.1+ MB
#Visualization-1
p3['sy_pnum'].isnull().sum()
0
p3['st_logg'].isnull().sum()
6529
Gravitational Pull of Host Star Vs No.of Planets
#dropping all the rows which are having null values for this'st_logg' column
# VISUALIZATION-1
column_to_clean = 'st_logg' # Replace with the column name
p_clean = p3.dropna(subset=[column_to_clean])
plt.figure(figsize=(10, 6))
sns.lineplot(x=p_clean['st_logg'], y=p_clean['sy_pnum'], alpha=0.6, color='green')
plt.title('Gravitational Pull vs Number of Planets', fontsize=14)
plt.xlabel('Gravitational Pull ', fontsize=12)
plt.ylabel('Number of Planets', fontsize=12)
plt.grid(True)
plt.show()
Discovery Methods vs Luminosity
# VISUALIZATION-2
p2['st_lum'].isnull().sum()
26431
#dropping all the rows which are having null values for this'st_lum' column
column_to_clean = 'st_lum' # Replace with the column name
p_clean = p2.dropna(subset=[column_to_clean])
#For Microlensing only 4 records found, Astrometry 2 records found,Disk Kinematics 2 records found..
#We are removing them in order to get Appropriate Visualizations...
values_to_remove = ["Microlensing","Astrometry","Disk Kinematics"]
p_clean['discoverymethod'] = p_clean['discoverymethod'].replace(values_to_remove, np.nan)
# Drop rows with NaN in the 'discoverymethod' column
p1_clean = p_clean.dropna(subset=['discoverymethod'])
<ipython-input-17-9ec2c2650897>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy p_clean['discoverymethod'] = p_clean['discoverymethod'].replace(values_to_remove, np.nan)
plt.figure(figsize=(20, 10))
sns.boxplot(x=p1_clean['discoverymethod'], y=p1_clean['st_lum'], color='green')
plt.title('Discovery Methods vs Luminosity', fontsize=14)
plt.xlabel('Discovery Methods', fontsize=12)
plt.ylabel('Luminosity', fontsize=12)
plt.grid(True)
plt.show()
import plotly.express as px
Discovery Method Vs Planet Types
# VISUALIZATION -3
if 'discoverymethod' in p2.columns and 'pl_letter' in p2.columns:
# Count the occurrences of each planet type by discovery method
planet_type_by_method = p2.groupby(['discoverymethod', 'pl_letter']).size().reset_index(name='count')
# Create a bar chart to visualize the count of planet types by discovery method
fig = px.bar(planet_type_by_method, x='discoverymethod', y='count', color='pl_letter',
title='Discovery Methods vs Planet Types',
labels={'discoverymethod': 'Discovery Method', 'pl_letter': 'Planet Type', 'count': 'Count of Planets'},
barmode='stack')
fig.update_layout(
yaxis=dict(tickmode='linear', tick0=0, dtick=1000),# dtick sets the difference between ticks
width=800, # Set width of the plot
height=600
)
fig.show()
else:
print("Columns 'discoverymethod' or 'pl_type' not found in the dataset.")
Stellar Luminosity vs Metallicity
p2["st_lum"].isnull().sum()
26431
p2["st_met"].isnull().sum()
13385
filtered_data = p2[['st_lum', 'st_met']].dropna() # removing all the rows with the null values of the mentioned columns
# Bin the metallicity values into discrete categories for the box plot
filtered_data['st_met_bins'] = pd.cut(filtered_data['st_met'], bins=5)
# Create the box plot
plt.figure(figsize=(12, 6))
sns.boxplot(data=filtered_data, x='st_met_bins', y='st_lum')
plt.title('Box Plot of Stellar Luminosity vs Metallicity Bins', fontsize=14)
plt.xlabel('Stellar Metallicity Bins [st_met]', fontsize=12)
plt.ylabel('Stellar Luminosity [st_lum]', fontsize=12)
plt.xticks(rotation=45)
plt.grid(alpha=0.3)
plt.show()
Planet Discovery Locations in Sky
fig = px.scatter_geo(
p3,
lat="glat",
lon="glon",
color="pl_letter", # Color by planet type (e.g., 'pl_letter' or other suitable column)
#size="pl_rade", # Optional: size of the points according to planet radius
hover_name="pl_name",
title="Planet Discovery Locations in the Sky",
opacity=0.2 # Set opacity of the dots to 50%
)
# Increase the size of the plot
fig.update_layout(
width=800, # Set the width of the plot (pixels)
height=800, # Set the height of the plot (pixels)
)
# Show the plot
fig.show()
Distribution of Planet Types
fig = px.pie(p3, names="pl_letter", title="Distribution of Planet Types")
fig.show()
No.of Exo Planets Discovered Over Time
discovery_trend = df1.groupby('disc_year').size()
# Plotting
plt.figure(figsize=(10, 4))
sns.barplot(x=discovery_trend.index, y=discovery_trend.values, palette='viridis')
plt.title('Number of Exoplanets Discovered Over Time', fontsize=16)
plt.xlabel('Discovery Year', fontsize=12)
plt.ylabel('Number of Exoplanets', fontsize=12)
plt.xticks(rotation=45)
plt.show()